# %% Extend the tokenizer vocabulary
import json
import os
from collections import Counter

from tqdm import tqdm
from transformers import BertModel, BertTokenizer

threshold = 50
# %%

folder = "./extend_poi_dataset"
wordcount = {}
for filename in os.listdir(folder):
    with open(os.path.join(folder, filename), "r", encoding="utf-8") as f:
        for line in tqdm(f.readlines(), desc=filename):
            for char in line[:-1]:
                if char in wordcount:
                    wordcount[char] += 1
                else:
                    wordcount[char] = 1



# %%
tokenizer = BertTokenizer.from_pretrained("./chinese-roberta-wwm-ext")
# %%
valid_word = {c: count for c, count in wordcount.items() if count >= threshold and c != ""}
for word, count in valid_word.items():
    flag = tokenizer.add_tokens(word)
    if flag:
        print(word, count)

# %%
## 